* this code creates the survey weights to match the ACS distributions
* we use the raking algorithm 
clear

import excel "C:\Dropbox\Average_target_survey\replication_folder\rawdata\survey_data_2020_09_01_w_responseid_region.xlsx", sheet("survey_data_2020_09_01_w_respon") cellrange(A3:CC5304) firstrow

*ssc install survwgt


** Data from ACS community survey 2018 release
** https://www.census.gov/acs/www/data/data-tables-and-tools/
* go by date
gen month = mofd(dofc(EndDate))
gen day = (dofc(EndDate))
drop if Q3=="Other"

egen day_id=group(day)
sum day_id
global max_days=r(max)

**** Initial weights **
gen preweight=1

save temp.dta, replace

* go through dates
forvalues jj=1/$max_days{
use temp, clear
keep if day_id==`jj'
save temp`jj'.dta, replace
}

* compute weights by day

forvalues jj=1/$max_days{
use temp`jj', clear
** Total survey respondents
count
local N=r(N)

** Gender Target **
gen tot_gender=0
replace tot_gender=0.492*`N' if Q3=="Male"
replace tot_gender=0.508*`N' if Q3=="Female"


** Define Income Groups **
gen     income=1
replace income=2 if Q49=="$50,000 - $99,999" 
replace income=3 if Q49=="$100,000 - $199,999" | Q49=="More than $200,000"  

** Income Target **
gen tot_income=0
replace tot_income= 0.408*`N' if income==1
replace tot_income= 0.300*`N' if income==2
replace tot_income= 0.292*`N' if income==3

** Define Age Groups **
gen age=1
replace age=2 if Q1>=35 & Q1<=44
replace age=3 if Q1>=45 & Q1<=54
replace age=4 if Q1>=55 & Q1<=59
replace age=5 if Q1>=60 

** Age Target **
gen tot_age=0
replace tot_age= 0.457*`N' if age==1
replace tot_age= 0.127*`N' if age==2
replace tot_age= 0.127*`N' if age==3
replace tot_age= 0.066*`N' if age==4
replace tot_age= 0.223*`N' if age==5

** Define Education Groups **

gen educ=1
replace educ=2 if Q48=="Some college, but no degree"
replace educ=3 if Q48=="Master's degree" | Q48=="Bachelor's degree" | Q48=="Doctorate or Professional Degree" 

** Education Targets **
gen tot_educ=0
replace tot_educ=0.385*`N' if educ==1
replace tot_educ=0.203*`N' if educ==2
replace tot_educ=0.412*`N' if educ==3


** Define Region Targets ** 
** 2019 data from https://www.census.gov/popclock/print.php?component=growth&image=//www.census.gov/popclock/share/images/growth_1561939200.png
*** South reduced from 38.3% to 38.2% in order to make fractions sum up to 1
gen tot_region=0
replace tot_region=0.171*`N' if region=="northeast"
replace tot_region=0.208*`N' if region=="midwest"
replace tot_region=0.382*`N' if region=="south"  
replace tot_region=0.239*`N' if region=="west"


* Ethnicity
gen race=5   
replace race=1 if Q51=="White/Caucasian" & Q52=="No"
replace race=2 if Q51=="Asian/Asian American" & Q52=="No"
replace race=3 if Q51=="Black/African American" & Q52=="No"
replace race=4 if Q52=="Yes"

* Ethnicity targets
gen tot_race= 0.602*`N' if race==1
replace tot_race= 0.056*`N' if race==2
replace tot_race= 0.123*`N' if race==3
replace tot_race= 0.183*`N' if race==4
replace tot_race= 0.036*`N' if race==5


* generate weights
survwgt rake preweight, by(Q3 income age educ region race) totvars(tot_gender tot_income tot_age tot_educ tot_region tot_race) generate(weight)  maxrep(25)

save temp_wt`jj'.dta, replace
}

*drop preweight tot_* age income educ race
svyset ResponseId [pw= weight]

* combine back together
use temp_wt1, clear
forvalues jj=2/$max_days{
append using temp_wt`jj', force
save combined_data_with_weights.dta, replace
}

* check demographic distributions by:
*svy: tab 





